---
title: "Cleaning Second European Union Minorities and Discrimination Survey"
---

# Load

```{r}
# load packages (via the project's shared helper script)
  source(file = "helper-packages.R")

# load second european union minorities and discrimination survey
# (raw .sav file; the path is relative to this document's location)
  seumads_raw <- 
    "../raw-data/y-multi-second-european-union-minorities-and-discrimination-survey/ZA6703_v1-0-0.sav" %>% 
    read_sav()
```

# Clean

```{r}
# declare dates
# Per-country fieldwork start/end dates, later joined onto the respondent data
# by `resp_country_common`.
# Source: EU-MIDIS II technical report, p.70
#   https://fra.europa.eu/sites/default/files/fra_uploads/fra-2017-eu-midis-ii-technical-report_en.pdf
# Dates are written in ISO 8601 (YYYY-MM-DD) and parsed with an explicit
# numeric format so the result does not depend on the session's LC_TIME
# locale (English month abbreviations parsed with "%b" silently become NA
# under non-English locales).
  seumads_dates <- 
    tribble(
      ~resp_country_common, ~resp_interview_start_date, ~resp_interview_end_date,
      "Austria",     "2015-10-21", "2016-04-14",
      "Belgium",     "2015-11-03", "2016-09-14",
      "Bulgaria",    "2015-11-11", "2016-02-20",
      "Cyprus",      "2015-10-22", "2016-03-05",
      "Czechia",     "2015-11-25", "2016-03-06",
      "Germany",     "2015-10-24", "2016-03-14",
      "Denmark",     "2015-10-16", "2016-05-31",
      "Estonia",     "2015-12-07", "2016-04-26",
      "Greece",      "2015-11-07", "2016-02-04", # countrycode EL
      "Spain",       "2015-11-02", "2016-04-07",
      "Finland",     "2015-11-02", "2016-08-09",
      "France",      "2015-11-06", "2016-11-11",
      "Croatia",     "2015-11-18", "2016-02-29", # countrycode HR
      "Hungary",     "2015-10-28", "2016-03-13",
      "Ireland",     "2015-10-26", "2016-05-09",
      "Italy",       "2015-10-30", "2016-03-26",
      "Latvia",      "2015-11-28", "2016-04-11",
      "Lithuania",   "2016-03-16", "2016-05-09",
      "Luxembourg",  "2015-10-31", "2016-02-05",
      "Malta",       "2015-11-18", "2016-01-26",
      "Netherlands", "2016-01-13", "2016-08-07",
      "Poland",      "2015-11-28", "2016-03-31",
      "Portugal",    "2015-11-10", "2016-03-03",
      "Romania",     "2015-10-22", "2016-03-18",
      "Sweden",      "2015-11-02", "2016-03-30", # SE
      "Slovenia",    "2015-11-26", "2016-03-04", # SI
      "Slovakia",    "2015-11-27", "2016-02-29", # SK
      "UK",          "2015-09-24", "2016-04-24") %>% 
    mutate(
      # convert the ISO strings to class Date
      resp_interview_start_date = as.Date(resp_interview_start_date, format = "%Y-%m-%d"),
      resp_interview_end_date = as.Date(resp_interview_end_date, format = "%Y-%m-%d"))

# clean
# Builds the harmonised respondent-level table: adds the `resp_*` variables
# (meta-data, demographics, two social-distance items, religiosity), attaches
# the per-country fieldwork dates from `seumads_dates`, and keeps only the
# `resp_*` columns.
  clean_seumads <- 
    seumads_raw %>% 
    mutate(
      
    #########################  
    ####### META-DATA #######  
    #########################      
      
      # source name (character vector, title case)
        resp_source = "Second European Union Minorities and Discrimination Survey",
    
      # url to dataset source, where publicly available (character vector)
        resp_original_data_url = "bit.ly/3t5LWcN",

      # survey mode (in-person/phone/internet)
        resp_survey_mode = "in-person",    

      # country (character vector; list of countries as written in original source)
        resp_country_original = to_character(country),

      # country (character vector; standardised name from countrycode::countryname())
      # NOTE(review): this is the join key for `seumads_dates`, so the spellings
      # in that table (e.g. "UK", "Czechia") must match what countryname()
      # returns with its default destination.
        resp_country_common = 
          countryname(resp_country_original),
      
      # interview date (variable of class Date; if only month given, input 1st of month)
      # not available per respondent, so a typed Date NA is used; the per-country
      # fieldwork window is attached by the join below
        resp_interview_date = as.Date(NA)) %>% 
    
    # attach resp_interview_start_date / resp_interview_end_date by country
    left_join(
      seumads_dates, by = "resp_country_common") %>% 
    mutate(      
   
    #########################  
    ##### DEMOGRAPHICS ######  
    #########################
      
      # respondent's religion (character vector that corresponds to master list)
      # NOTE(review): code 99 is mapped to "Other religion" here, while 99 is
      # treated as non-substantive (NA) for every other variable in this file;
      # confirm against the codebook for PB01.
        resp_religion = 
          dplyr::recode(
            as.character(PB01),
            "0" = NA_character_,
            "1" = "Christian",
            "2" = "Muslim",
            "3" = "Jewish",
            "4" = "Hindu",
            "5" = "Sikh",
            "6" = "Buddhist",
            "7" = "Other religion",
            "96" = NA_character_,
            "97" = NA_character_,
            "99" = "Other religion"),  

      # respondent's denomination (character vector; same PB01 item, keeping the
      # longer original labels for Christian and Muslim)
        resp_denomination =
          dplyr::recode(
            as.character(PB01),
            "0" = NA_character_,
            "1" = "Christian (Catholic, Protestant, Orthodox, Evangelic, Old Catholic, etc.)",
            "2" = "Muslim (Shia, Sunni, Sufi, etc.)",
            "3" = "Jewish",
            "4" = "Hindu",
            "5" = "Sikh",
            "6" = "Buddhist",
            "7" = "Other religion",
            "96" = NA_character_,
            "97" = NA_character_,
            "99" = "Other religion"),   
    
      # respondent's age (character vector; bins denoted by single dash ["18-25"])
      # codes 996/997/999 become NA; every other value is kept as its character form
      # note, there are children in the sample (< 16); this appears to be a roster
      # and nobody below age 16 is asked the social distance questions
        resp_age =
          dplyr::recode(
            as.character(HH02),
            "996" = NA_character_,
            "997" = NA_character_,
            "999" = NA_character_),
      
      # respondent's education level (original label; harmonised category in
      # square brackets); any code other than 1-4 becomes NA
        resp_education_original =
          dplyr::recode(
            as.character(EDU_achieved), # this seems to be the most consolidated edu variable with least missingness
            "1" = "Never been in formal education / Never completed primary education (/or level in COUNTRY yet) (ISCED 0) [No education]",
            "2" = "Primary and lower secondary education (ISCED 1+2) [Primary]",
            # ISCED 3 to 5 is tagged [Secondary]; it previously repeated the
            # [Primary] tag of level 2
            "3" = "Upper secondary, vocational, post-secondary, short cycle tertiary education (ISCED 3 to 5) [Secondary]",
            "4" = "Tertiary education (ISCED 6-8) [College]",
            .default = NA_character_),       
      
      # respondent's gender (numeric: female = 1; male = 0; other = NA)
        resp_female = 
          case_when(
            HH03 == 1 ~ 0,
            HH03 == 2 ~ 1,
            TRUE ~ NA_real_),
      
      # respondent resident in rural (vs urban) area (numeric: rural = 1; urban/semi-urban/peri-urban = 0)
        resp_rural = 
          case_when(
            DEGURBA == 3 ~ 1, # Thinly populated area
            DEGURBA %in% c(1:2) ~ 0, # Densely populated area, Intermediate density area
            TRUE ~ NA_real_),
      
    #########################  
    ### SOCIAL DISTANCE 1 ###  
    #########################
    
      # original question number; question text; response options (input above)
        resp_soc_dist_1_qinfo = "NUM: PB12.1; QTEXT: Using a scale from 0 to 10, please tell me how you would feel about having someone from one of the following groups as your neighbour? A person who has a different religion than yours.; ROPTIONS: 0(1/2/3/4) = Uncomfortable [=1] + (5/6/7/8/9)10 = Comfortable [=0]; TARGET: Different religion; TYPE: Distance, neighbor",
      
      # original response (as character vector; codes 96/97/99 set to NA)
        resp_soc_dist_1_original =
          dplyr::recode(
            as.character(PB12_1),
            "96" = NA_character_,
            "97" = NA_character_,
            "99" = NA_character_),            

      # binary recode (numeric: 1 = any negative attitude expressed; 0 otherwise)
      # 0-4 -> 1, 5-10 -> 0, anything else -> NA
        resp_soc_dist_1_bin_recode =
          case_when(
            PB12_1 %in% c(0:4) ~ 1,
            PB12_1 %in% c(5:10) ~ 0,
            TRUE ~ NA_real_),

    #########################  
    ### SOCIAL DISTANCE 2 ###  
    #########################
    
      # original question number; question text; response options (input above)
        resp_soc_dist_2_qinfo = "NUM: PB13.1; QTEXT: Using a scale from 0 to 10, please tell me how you would feel about someone from your family being married to a person from one of the following groups? A person who has a different religion than yours.; ROPTIONS: 0(1/2/3/4) = Uncomfortable [=1] + (5/6/7/8/9)10 = Comfortable [=0]; TARGET: Different religion; TYPE: Distance, family",
     
      # original response (as character vector; codes 96/97/99 set to NA)
        resp_soc_dist_2_original =
          dplyr::recode(
            as.character(PB13_1),
            "96" = NA_character_,
            "97" = NA_character_,
            "99" = NA_character_),            

      # binary recode (numeric: 1 = any negative attitude expressed; 0 otherwise)
      # 0-4 -> 1, 5-10 -> 0, anything else -> NA
        resp_soc_dist_2_bin_recode =
          case_when(
            PB13_1 %in% c(0:4) ~ 1,
            PB13_1 %in% c(5:10) ~ 0,
            TRUE ~ NA_real_),
    
    ############################  
    ### GENERAL SOCIAL TRUST ###  
    ############################
    
      # checked; no general trust question in this survey, so the three
      # placeholders below are typed NAs matching their documented classes
    
      # original question number; question text; response options (input above)
        resp_gentrust_qinfo = NA_character_,

      # original response (as character vector)
        resp_gentrust_original = NA_character_,       

      # binary recode (numeric: 1 = any negative attitude expressed; 0 otherwise)
        resp_gentrust_bin_recode = NA_real_,
    
    #########################  
    ##### RELIGIOSITY #######  
    #########################
    
      # original question number; question text; response options (input above)
        resp_religiosity_qinfo = "NUM: PB03; QTEXT: To what extent do you feel (Christian/Muslim/PB01 answer)?; ROPTIONS: 1 = Not at all + 5 = Very strongly",
  
      # original response (as numeric vector, with non-substantive responses coded as NA_real_)
        resp_religiosity_original = 
          case_when(
            PB03 %in% c(96, 97, 99) ~ NA_real_,
            TRUE ~ as.numeric(PB03)),       

      # recode (numeric: scaled 0-1, where 1 is more religious)
      # maps the 1-5 scale linearly onto 0-1
        resp_religiosity_recode = (resp_religiosity_original - 1)/4
    
    ) %>% 
    # keep only the harmonised variables
    select(starts_with("resp_"))
```

# Save data

```{r}
  # write the cleaned respondent-level table to the cleaned-data folder
  saveRDS(
    object = clean_seumads,
    file = "../cleaned-data/y-19-multi-second-european-union-minorities-and-discrimination-survey.rds")
```
